{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 测试8个GPU之间的HCCL通信带宽的脚本\n",
    "\n",
    "- **平均测试时间**：是指某一算法先做m次预热，再做n次测试，并从n次测试开始计时，计算算法完成时平均每次迭代的耗时\n",
    "\n",
    "- **算法带宽**：是指申请内存大小/平均时间的数据，包含数据传输、计算和内存复制的带宽"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os, time, utils\n",
    "from glob import glob\n",
    "\n",
    "# 获取程序运行的根目录\n",
    "root_path = utils.get_rootpath()\n",
    "log_path = f\"{root_path}/log\"\n",
    "# 其实内存容量（单位Byte）\n",
    "test_mem_start = 2048 * 1024 * 1024\n",
    "# 结束的内存容量（单位Byte）\n",
    "test_mem_stop = 2048 * 1024 * 1024\n",
    "# 增长倍数\n",
    "test_mem_factor = 2\n",
    "# 定义算法的名字和对应的绘图的颜色\n",
    "test_func = [\n",
    "    \"all_gather\",\n",
    "    \"all_reduce\",\n",
    "    \"alltoall\",\n",
    "    \"alltoallv\",\n",
    "    \"broadcast\",\n",
    "    \"reduce\",\n",
    "    \"reduce_scatter\",\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A. 清理工作环境"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root_path: /root/workdir/hccl_test\n"
     ]
    }
   ],
   "source": [
    "print(f\"root_path: {root_path}\")\n",
    "os.system(f\"rm -rf {log_path}\")\n",
    "os.chdir(root_path)\n",
    "os.makedirs(log_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### B. 重新编译所有的hccl方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir(f\"{root_path}/hccl\")\n",
    "os.system(\"make clean\")\n",
    "os.system(\"make\")\n",
    "os.chdir(root_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取计算列表"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A. 根据参数自动生成列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mem_list len: 7\n"
     ]
    }
   ],
   "source": [
    "mem_list = []\n",
    "mem_size = test_mem_start\n",
    "while mem_size <= test_mem_stop:\n",
    "    for func in test_func:\n",
    "        mem_list.append([func, mem_size])\n",
    "    mem_size = mem_size * 2\n",
    "mem_count = len(mem_list)\n",
    "print(f\"mem_list len: {mem_count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### B. 手动指定处理列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mem_list len: 1\n"
     ]
    }
   ],
   "source": [
    "mem_list = [\n",
    "    # [\"all_gather\", 2048 * 1024 * 1024],\n",
    "    [\"all_reduce\", 2048 * 1024 * 1024],\n",
    "    # [\"alltoall\", 2048 * 1024 * 1024],\n",
    "    # [\"alltoallv\", 2048 * 1024 * 1024],\n",
    "    # [\"broadcast\", 2048 * 1024 * 1024],\n",
    "    # [\"reduce\", 2048 * 1024 * 1024],\n",
    "    # [\"reduce_scatter\", 2048 * 1024 * 1024],\n",
    "]\n",
    "mem_count = len(mem_list)\n",
    "print(f\"mem_list len: {mem_count}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 运行测试程序"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### A. 性能测试\n",
    "\n",
    "该测试脚本主要利用msprof工具抓取trace性能工具，运行对应参数的方法，并将多个节点的结果合并到一起，最终的产物为位于log文件夹下的timeline的json文件，通过trace分析工具可以直接导入并分析该文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "001 / 001 (100.00%) >> run   all_reduce_test in 2.0GB...done in 00:02:59\n",
      "001 / 001 (100.00%) >> merge all_reduce timeline json...done in 00:00:24\n",
      "001 / 001 (100.00%) >> proc  all_reduce total use: 00:03:23\n"
     ]
    }
   ],
   "source": [
    "utils.load_env()\n",
    "npus = os.environ[\"HCCL_TEST_USE_DEVS\"].split(\",\")\n",
    "for i in range(len(mem_list)):\n",
    "    # 准备基础变量, 打印前缀信息\n",
    "    global_start = time.time()\n",
    "    local_start = time.time()\n",
    "    [func, mem_size] = mem_list[i]\n",
    "    prt_precent = f\"{((i + 1.0) / mem_count * 100):02.2f}%\"\n",
    "    prt_prefix = f\"{i + 1:03d} / {len(mem_list):03d} ({prt_precent})\"\n",
    "    print(f\"{prt_prefix} >> run   {func}_test in {utils.get_size(mem_size)}...\", end=\"\")\n",
    "    # 准备后续操作需要的变量\n",
    "    mpirun_template = f\"{root_path}/script/mpirun.template\"\n",
    "    mpirun_script = f\"{log_path}/mpirun_script.sh\"\n",
    "    msprof_template = f\"{root_path}/script/msprof.template\"\n",
    "    mpirun_args = {\n",
    "        \"npus\": len(npus),\n",
    "        \"mem_size\": mem_size,\n",
    "        \"exec\": f\"{root_path}/bin/{func}_test\",\n",
    "        \"log_path\": f\"{log_path}/{func}_{utils.get_size(mem_size)}_{len(npus)}npus.log\",\n",
    "    }\n",
    "    msprof_args = {\n",
    "        \"script_path\": mpirun_script,\n",
    "        \"prof_path\": f\"{log_path}/tmp\",\n",
    "        \"log_path\": f\"{log_path}/mpirun_script.log\",\n",
    "    }\n",
    "    # 删除旧的临时文件, 并开始性能测试\n",
    "    os.system(f\"rm -rf {log_path}/tmp\")\n",
    "    utils.get_script(mpirun_template, mpirun_args, mpirun_script)\n",
    "    os.system(utils.get_script(msprof_template, msprof_args))\n",
    "    print(f\"done in {utils.get_time(local_start)}\")\n",
    "    # 开始合并device侧的timeline文件\n",
    "    local_start = time.time()\n",
    "    print(f\"{prt_prefix} >> merge {func} timeline json...\", end=\"\")\n",
    "    msprof_timeline_src = f\"{log_path}/tmp/PROF_*\"\n",
    "    msprof_timeline_dst = f\"{log_path}/{func}_{utils.get_size(mem_size)}\"\n",
    "    utils.merge_timeline(glob(msprof_timeline_src), msprof_timeline_dst)\n",
    "    print(f\"done in {utils.get_time(local_start)}\")\n",
    "    print(f\"{prt_prefix} >> proc  {func} total use: {utils.get_time(global_start)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### B. 普通测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "utils.load_env()\n",
    "npus = os.environ[\"HCCL_TEST_USE_DEVS\"].split(\",\")\n",
    "for i in range(len(mem_list)):\n",
    "    # 初始化变量\n",
    "    local_start = time.time()\n",
    "    [func, mem_size] = mem_list[i]\n",
    "    prt_precent = f\"{((i + 1.0) / mem_count * 100):02.2f}%\"\n",
    "    prt_prefix = f\"{i + 1:03d} / {len(mem_list):03d} ({prt_precent})\"\n",
    "    print(f\"{prt_prefix} >> {func} in {utils.get_size(mem_size)}...\", end=\"\")\n",
    "    # 设置路径, 运行脚本\n",
    "    script_path = f\"{root_path}/script/mpirun.template\"\n",
    "    args_dict = {\n",
    "        \"npus\": len(npus),\n",
    "        \"mem_size\": mem_size,\n",
    "        \"exec\": f\"{root_path}/bin/{func}_test\",\n",
    "        \"log_path\": f\"{log_path}/{func}_{len(npus)}npus.log\",\n",
    "    }\n",
    "    os.system(utils.get_script(script_path, args_dict))\n",
    "    print(f\"done in {utils.get_time(local_start)}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
